# File:     topic_model_analysis.R
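# Purpose:  This script examines which topics are interrupted most and how
#           Yellen's topic usage compares with that of the other Fed chairs
# Input:    ../Data/finalData.RData
#           ../../../../Data/Hearings/topic_models_100.RData
#           ../Data/Hearings/stm_results.RData
# Output:   ../Paper/Figures/topics_interruptions_loadings.pdf
#           ../output/figures/topics_interruptions_loadings.pdf
#           ../output/figures/topics_interruptions.pdf
#           ../Paper/Figures/stm_SI.pdf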
# Author:   JB


rm(list = ls())
require(lme4)
require(lfe)
require(tidyverse)
require(ggridges)


# Loading Data
# NOTE(review): the objects used further down (`utterance_level`, `lda_model`,
# `doc_topic_distr`) are presumably supplied by these two files -- confirm
# which file provides which object.
load('../Data/finalData.RData')
load('../../../../Data/Hearings/topic_models_100.RData')

# Top 50 words per topic as a data frame; any 'X' in the column names is
# replaced with 'topic_' (presumably turning X1, X2, ... into topic_1,
# topic_2, ... -- confirm), and `top_word` records the row position.
topTerms <- data.frame(lda_model$get_top_words(n = 50))
names(topTerms) <- gsub('X','topic_',names(topTerms))
topWord <- as_tibble(mutate(topTerms,top_word = row_number()))


# Looking at which topics are interrupted the most

# Label for each column of topWord: its first three entries joined by ', '.
topicLabels <- topWord %>%
  gather(key,term) %>%
  group_by(key) %>%
  slice(1:3) %>%
  summarise(terms = paste(term,collapse = ', '))

# Mean of every column matching 'topic_' among non-Yellen utterances, computed
# separately for each value of `interrupted`, then reshaped to long form.
# Columns whose name contains 'lag' are dropped after reshaping.
topicMeans <- utterance_level %>%
  filter(nchars > 0,ind > mind,!grepl("Yellen",speaker)) %>%
  group_by(interrupted) %>%
  summarise_at(vars(matches('topic_')),mean,na.rm=TRUE) %>%
  gather(key,value,-interrupted) %>%
  filter(!grepl('lag',key))

# Natural join on the shared column (`key`).
toplot <- left_join(topicMeans,topicLabels)

# Gap between the `interrupted == 1` and `interrupted == 0` topic means, one
# row per topic label. `col` is computed but not mapped to any aesthetic below.
interruptGap <- toplot %>%
  spread(interrupted,value) %>%
  mutate(diff = `1` - `0`,
         col = ifelse(diff > 0,'red','blue'))

# Horizontal bars of that gap, with topic labels ordered by the gap.
p1 <- ggplot(interruptGap,aes(x = diff,y = reorder(terms,diff))) + 
  geom_bar(stat = 'identity',alpha = .5) + 
  xlab('Pr(topic | interrupted) - Pr(topic | not interrupted)') + 
  ylab('') +
  theme_ridges() + 
  theme(axis.text.y = element_text(size = 10))




# Who uses these topics most?

# One row per (utterance, topic) with the speaker collapsed into three groups.
speakerTopics <- utterance_level %>%
  mutate(tmpSpk = case_when(grepl('Yellen',speaker) ~ 'Yellen',
                            grepl('Bernanke|Greenspan|Powell',speaker) ~ 'Other Fed Chairs',
                            TRUE ~ 'All Others')) %>%
  filter(nchars > 0,ind > mind) %>%
  select(tmpSpk,fullInd,matches('topic_\\d+$')) %>%
  gather(topic,theta,-tmpSpk,-fullInd)

# Label for each column of topWord: its first three entries joined by ', '.
speakerTopicLabels <- topWord %>%
  gather(topic,term) %>%
  group_by(topic) %>%
  slice(1:3) %>%
  summarise(terms = paste(term,collapse = ', '))

# Mean, standard deviation and count of theta per speaker group and topic,
# with the topic label attached (natural join on `topic`).
toplot2 <- speakerTopics %>%
  # group_by(fullInd) %>%
  # filter(theta == max(theta)) %>%
  group_by(tmpSpk,topic) %>%
  summarise(m = mean(theta),
            sd = sd(theta),
            n = n()) %>%
  left_join(speakerTopicLabels)

# Attach the interrupted-minus-uninterrupted gap (diffInt) to the per-speaker
# topic summary. The join is natural, i.e. on the columns the two tables share.
interruptWide <- spread(toplot,interrupted,value)
interruptWide <- mutate(interruptWide,diffInt = `1` - `0`)
toplot2 <- left_join(interruptWide,toplot2)

# Add one column per Fed-chair group ('Yellen', 'Other Fed Chairs') holding
# that group's mean for the topic, repeated on every row of the topic.
chairMeans <- toplot2 %>%
  filter(tmpSpk != 'All Others') %>%
  select(tmpSpk,terms,m) %>%
  spread(tmpSpk,m)
toplot3 <- left_join(toplot2,chairMeans)

require(ggrepel)

# Segment joining the 'Other Fed Chairs' mean to the Yellen mean on a topic row.
chairGap <- aes(x = `Other Fed Chairs`,y = reorder(terms,diffInt),
                xend = Yellen,yend = reorder(terms,diffInt))

# Each segment is drawn from the row of whichever chair group has the larger
# mean, so the colour inherited from `tmpSpk` is that group's colour.
yellenHigher <- toplot3 %>% filter(tmpSpk == 'Yellen',
                                   Yellen > `Other Fed Chairs`)
yellenLower <- toplot3 %>% filter(tmpSpk == 'Other Fed Chairs',
                                  Yellen < `Other Fed Chairs`)

# Only this one topic row gets speaker labels.
labelledRows <- toplot2 %>% filter(terms == 'gentleman, mr, time')

# Mean topic weight by speaker group, rows ordered by diffInt; point size is
# the number of observations behind each mean.
p2 <- ggplot(toplot3,
             aes(x = m,y = reorder(terms,diffInt),color = tmpSpk,size = n,label = tmpSpk,shape = tmpSpk)) + 
  geom_point(alpha = .7) + 
  geom_segment(data = yellenHigher,chairGap,size = 1) + 
  geom_segment(data = yellenLower,chairGap,size = 1) + 
  theme_ridges() + xlab('Average topic by speaker') + 
  scale_size_continuous(guide = 'none') + 
  scale_shape_manual(name = 'Speaker',values = c(21,18,19)) + 
  scale_color_discrete(name = 'Speaker') + 
  # theme(legend.position = 'none') + 
  ylab('') + 
  geom_text_repel(data = labelledRows,size = 3)
p2


# Mean theta per topic across doc_topic_distr, with the topic id rewritten to
# the 'topic_<k>' form used elsewhere in this script.
topicPrevalence <- doc_topic_distr %>%
  mutate(topic = paste0('topic_',topic)) %>%
  group_by(topic) %>%
  summarise(theta = mean(theta))

# One row per topic: interruption gap (diffInt), Yellen-minus-other-chairs gap
# (diffTop) and mean theta. distinct() collapses the per-speaker duplicates.
toplot4 <- toplot3 %>%
  left_join(topicPrevalence) %>%
  mutate(diffTop = Yellen - `Other Fed Chairs`) %>%
  select(terms,diffInt,diffTop,theta,topic) %>%
  distinct()

require(gridExtra)
require(cowplot)

# Side-by-side figure: p1 on the left, p2 (with its y-axis labels removed) on
# the right, written to a PDF.
pdf('../Paper/Figures/topics_interruptions_loadings.pdf',width = 9,height = 7)
# print() is called explicitly: a plot object is only drawn when it is printed,
# and top-level auto-printing does not happen when the script is run through
# source(), which would leave the PDF empty.
print(cowplot::plot_grid(p1,p2 + scale_y_discrete(labels = NULL),rel_widths = c(.6,.4)))
dev.off()

# Show the same comparison as a scatter plot: Yellen-vs-other-chairs topic gap
# (x) against the interrupted-vs-not topic gap (y), one point per topic.
require(ggrepel)

# Hand-picked topic ids; these topics are always labelled in the scatter plot.
substantive <- paste0('topic_',c(1,4,6,7,8,10,12,14,18,20,24,26,32,34,38,41,42,43,46,49,
                                 50,52,53,55,58,61,62,63,65,68,72,77,80,83,85,89,90,95,96,100))

# Labels are drawn for the hand-picked topics plus any topic whose interruption
# gap exceeds .01 in absolute value.
labelledTopics <- toplot4 %>%
  filter(topic %in% substantive | abs(diffInt) > .01)

loadingsScatter <- toplot4 %>%
  filter(terms != 'issue, get, type') %>%
  ggplot(aes(x = diffTop,y = diffInt,label = terms,size = theta,weight= theta)) + 
  geom_point(shape = 21) + 
  geom_smooth(show.legend = FALSE) + 
  geom_vline(xintercept = 0,linetype = 'dashed') + 
  geom_hline(yintercept = 0,linetype = 'dashed') +  
  ylab(bquote('' %<-% ' Topic Interrupted Less ... Topic Interrupted More ' %->% '')) + 
  xlab(bquote('' %<-% ' Topic Used More by Fed Chairs ... Topic Used More by Yellen ' %->% '')) + 
  theme_ridges() + 
  scale_size_continuous(name = 'Prevalence') + 
  theme(axis.title.x = element_text(hjust = .5,vjust = 0),
        axis.title.y = element_text(hjust = .5,vjust = 0),
        legend.position = 'bottom') + 
  geom_text_repel(data = labelledTopics,size = 3)

pdf('../output/figures/topics_interruptions_loadings.pdf',width = 7,height = 7)
# print() is called explicitly: top-level auto-printing does not happen when
# the script is run through source(), which would leave the PDF empty.
print(loadingsScatter)
dev.off()


# Diagnostic (result is printed, not stored): keep the highest-theta topic for
# each (fullInd, interrupted) pair and sort so that any fullInd with more than
# one row comes first.
utterance_level %>% 
  # filter(ind > mind) %>% 
  mutate(chairName = gsub('MrPowell','Powell',gsub('.*? |\\.','',speaker)),
         tmpSpk = ifelse(grepl('Yellen|Bernanke|Powell|Greenspan',speaker),
                         chairName,'Others')) %>%
  select(fullInd,interrupted,nchars,tmpSpk,matches('topic_\\d+$')) %>%
  gather(topic,theta,-fullInd,-interrupted,-nchars,-tmpSpk) %>%
  group_by(fullInd,interrupted) %>%
  filter(theta == max(theta)) %>%
  slice(1) %>%                 # break ties: keep one row per group
  ungroup() %>%
  arrange(fullInd) %>%
  group_by(fullInd) %>%
  mutate(n = n()) %>%
  arrange(desc(n))

# Interruption rate by dominant topic and speaker group.
#
# Each utterance is assigned its highest-theta topic; for every (topic,
# speaker group) pair we then compute the share interrupted (`int`), mean
# theta, mean length and count, plus each group's total count (`tot`), its
# average of `int` across topics (`avgInt`) and the topic's share of that
# group's utterances (`share`).

# Label for each column of topWord: its first five entries joined by ', '.
topicLabels5 <- topWord %>%
  gather(topic,term) %>%
  group_by(topic) %>%
  slice(1:5) %>%
  summarise(terms = paste(term,collapse = ', '))

topicSummary <- utterance_level %>% 
  filter(ind > mind) %>%
  mutate(tmpSpk = ifelse(grepl('FED',opensecretsID),as.character(opensecretsID),'Others')) %>%
  mutate(tmpSpk = ifelse(grepl('YELLEN',tmpSpk),'Yellen',
                         ifelse(grepl('FED',tmpSpk),'Male Fed Chairs','Others'))) %>%
  arrange(fullInd) %>%
  select(fullInd,interrupted,nchars,tmpSpk,matches('topic_\\d+$')) %>%
  gather(topic,theta,-fullInd,-interrupted,-nchars,-tmpSpk) %>%
  group_by(fullInd,interrupted) %>%
  filter(theta == max(theta)) %>%
  slice(1) %>%                 # break ties: keep one row per group
  ungroup() %>%
  arrange(fullInd) %>% distinct() %>%
  group_by(topic,tmpSpk) %>%
  summarise(int = mean(interrupted),
            meanTheta = mean(theta,na.rm=TRUE),
            meanChars = mean(nchars,na.rm=TRUE), 
            n = n()) %>%
  group_by(tmpSpk) %>%
  mutate(tot = sum(n),
         avgInt = mean(int)) %>%
  ungroup() %>%
  mutate(share = n/tot) %>%
  left_join(topicLabels5,by = 'topic')

# Topics are ordered by Yellen's interruption rate. The ordering has to come
# from the summary built just above: referring to `toplot` at this point would
# read the older object of that name, which has no `tmpSpk` column.
yellenOrder <- topicSummary %>%
  filter(tmpSpk == 'Yellen') %>%
  arrange(int)

toplot <- topicSummary %>%
  mutate(terms = factor(terms,levels = unique(yellenOrder$terms)),
         topic = factor(topic,levels = yellenOrder$topic)) %>%
  # Topics that never appear for Yellen have no factor level and become NA.
  filter(!is.na(terms),
         tmpSpk != 'Others') 

# Figure: interruption rate by topic for Yellen vs. the male Fed chairs.

# One row per speaker group with its average interruption rate; used for the
# vertical reference lines and for their text labels.
speakerAvgs <- toplot %>% select(tmpSpk,avgInt) %>% distinct()

# One row per topic with a column of interruption rates per speaker group
# (without and with the topic label).
intByTopic <- toplot %>%
  select(topic,tmpSpk,int) %>%
  spread(tmpSpk,int)
intByTopicLabelled <- toplot %>%
  select(topic,terms,tmpSpk,int) %>%
  spread(tmpSpk,int)

topicsPlot <- ggplot(toplot,aes(x = int,y = topic,size = share,fill = tmpSpk,shape = tmpSpk)) + 
  geom_point(alpha = 1,size = 1) + 
  geom_vline(data = speakerAvgs,aes(xintercept = avgInt,color = tmpSpk)) +
  scale_size_continuous(name = '% of Utterances',range = c(1,10),labels = scales::percent) +
  theme_ridges() + 
  # scale_fill_discrete(name = 'Speaker') +
  scale_shape_manual(guide = 'none',name = 'Speaker',values = 21:25) + 
  # Connectors between the two groups' rates: orange where Yellen's rate is
  # higher, black otherwise. `linewidth` replaces the deprecated `size` for
  # line geoms; these layers do not inherit the global aesthetics.
  geom_segment(data = intByTopic %>% filter(Yellen > `Male Fed Chairs`),
               aes(x = Yellen,y = topic,xend = `Male Fed Chairs`,yend = topic),
               linewidth = .5,color = 'darkorange',inherit.aes = FALSE) + 
  geom_segment(data = intByTopic %>% filter(Yellen <= `Male Fed Chairs`),
               aes(x = Yellen,y = topic,xend = `Male Fed Chairs`,yend = topic),
               linewidth = .5,color = 'black',inherit.aes = FALSE) + 
  geom_point(alpha = .35) + 
  # Topic labels sit at the larger of the two rates. Where Yellen's rate is
  # higher, only the hand-picked `substantive` topics are labelled.
  geom_text(data = intByTopicLabelled %>%
              filter(topic %in% substantive) %>%
              filter(Yellen > `Male Fed Chairs`),
            aes(x = Yellen,y = topic,label = terms),inherit.aes = FALSE,
            size = 2,hjust = -.1) + 
  geom_text(data = intByTopicLabelled %>%
              # filter(topic %in% substantive) %>%
              filter(Yellen <= `Male Fed Chairs`),
            aes(x = `Male Fed Chairs`,y = topic,label = terms),inherit.aes = FALSE,
            size = 2,hjust = -.1) + 
  scale_fill_manual(guide = 'none',name = 'Speaker',values = c('black','darkorange')) + 
  scale_x_continuous(breaks = c(0,.25,.5,.75,1),labels = scales::percent) + 
  theme(legend.position = c(.7,.15),
        axis.title.x = element_text(hjust = .6),
        panel.grid.major.x = element_line(linewidth = .1),
        panel.grid.major.y = element_blank(),
        legend.title = element_text(size = 10),legend.text = element_text(size = 9),
        axis.text.y = element_blank()) + 
  xlab('Proportion Interrupted') + ylab('Topic') + 
  scale_color_manual(guide = 'none',name = 'Speaker',values = c('black','darkorange')) + 
  scale_linetype_discrete(guide = 'none',name = 'Speaker') + 
  # Speaker names next to their average lines; the two hard-coded y positions
  # assume exactly two speaker groups remain in `toplot`.
  geom_text(data = speakerAvgs,
            aes(x = avgInt,y = c(100,102),label = tmpSpk,color = tmpSpk),
            inherit.aes = FALSE,hjust = 0,size = 3.5,show.legend = FALSE) +
  coord_cartesian(ylim = c(0,103),xlim = c(0,1.5),clip = 'off')

pdf('../output/figures/topics_interruptions.pdf',width = 7,height = 8)
# print() is called explicitly: top-level auto-printing does not happen when
# the script is run through source(), which would leave the PDF empty.
print(topicsPlot)
dev.off()

# toplot <- utterance_level %>% 
#   filter(ind > mind) %>%
#   mutate(tmpSpk = ifelse(grepl('Yellen|Bernanke|Powell|Greenspan',speaker),
#                          gsub('MrPowell','Powell',gsub('.*? |\\.','',speaker)),'Others')) %>%
#   select(fullInd,interrupted,nchars,tmpSpk,matches('topic_\\d+$')) %>%
#   gather(topic,theta,-fullInd,-interrupted,-nchars,-tmpSpk) %>%
#   group_by(fullInd,interrupted) %>%
#   filter(theta == max(theta)) %>%
#   slice(1) %>%
#   ungroup() %>%
#   arrange(fullInd) %>% distinct() %>%
#   group_by(topic,tmpSpk) %>%
#   summarise(int = mean(interrupted),
#             meanTheta = mean(theta,na.rm=T),
#             meanChars = mean(nchars,na.rm=T), 
#             n=n()) %>%
#   group_by(tmpSpk) %>%
#   mutate(tot = sum(n),
#          avgInt = mean(int)) %>%
#   ungroup() %>%
#   mutate(share = n/tot) %>%
#   left_join(topWord %>%
#               gather(topic,term) %>%
#               group_by(topic) %>%
#               slice(1:5) %>%
#               group_by(topic) %>%
#               summarise(terms = paste(term,collapse = ', '))) 
# 
# 
# toplot %>%
#   mutate(terms = factor(terms,levels = toplot %>% filter(tmpSpk == 'Yellen') %>% arrange(int) %>% .$terms)) %>%
#   filter(terms != 'NA') %>%
#   ggplot(aes(x = int,y = terms,fill = tmpSpk,shape = tmpSpk)) + 
#   geom_point(alpha = .5) + 
#   # scale_size_continuous(name = '% of Utterances',range = c(1,10)) + 
#   theme_ridges() + 
#   scale_fill_discrete(name = 'Speaker') +
#   scale_shape_manual(name = 'Speaker',values = 21:25) + 
#   theme(legend.position = 'right',
#         axis.text.y = element_text(size = 6),
#         plot.margin = unit(c(3,1,1,1),'lines'),legend.title = element_text(size = 10),legend.text = element_text(size = 9)) + 
#   xlab('Proportion Interrupted') + ylab('') + 
#   geom_vline(aes(xintercept = avgInt,color = tmpSpk,linetype = tmpSpk)) + 
#   scale_color_discrete(name = 'Speaker') + 
#   scale_linetype_discrete(name = 'Speaker') + 
#   geom_text(data = toplot %>% select(tmpSpk,avgInt) %>% distinct(),aes(x = avgInt,y = c(104,102,100,106,108),label = tmpSpk,color = tmpSpk),
#             inherit.aes = F,hjust = 0,size = 3.5,show.legend = FALSE) + 
#   coord_cartesian(ylim = c(0,108),clip = 'off')
# 
# pdf('../Paper/Figures/topics_interruptions.pdf',width = 8,height = 7.5)
# toplot %>%
#   mutate(terms = factor(terms,levels = toplot %>% filter(tmpSpk == 'Yellen') %>% arrange(int) %>% .$terms)) %>%
#   filter(terms != 'NA') %>%
#   ggplot(aes(x = int,y = terms,fill = tmpSpk,size = share,shape = tmpSpk)) + 
#   geom_point(alpha = .5) + 
#   scale_size_continuous(name = '% of Utterances',range = c(1,10)) + 
#   theme_ridges() + 
#   scale_fill_discrete(name = 'Speaker') +
#   scale_shape_manual(name = 'Speaker',values = 21:25) + 
#   theme(legend.position = 'right',
#         axis.text.y = element_text(size = 6),
#         plot.margin = unit(c(3,1,1,1),'lines'),legend.title = element_text(size = 10),legend.text = element_text(size = 9)) + 
#   xlab('Proportion Interrupted') + ylab('') + 
#   geom_vline(aes(xintercept = avgInt,color = tmpSpk,linetype = tmpSpk)) + 
#   scale_color_discrete(name = 'Speaker') + 
#   scale_linetype_discrete(name = 'Speaker') + 
#   geom_text(data = toplot %>% select(tmpSpk,avgInt) %>% distinct(),aes(x = avgInt,y = c(104,102,100,106,108),label = tmpSpk,color = tmpSpk),
#             inherit.aes = F,hjust = 0,size = 3.5,show.legend = FALSE) + 
#   coord_cartesian(ylim = c(0,108),clip = 'off')
# dev.off()



# Repeat the analysis with the saved STM results.
# The workspace is cleared first, so nothing computed above is available below.
rm(list = ls())
gc()
# NOTE(review): `out`, `fit` and `prep` are presumably supplied by this file -- confirm.
load('../Data/Hearings/stm_results.RData')

# fit$theta in long form: one row per (row index, topic), with any 'X' in the
# column name replaced by 'topic_'.
thetaLong <- fit$theta %>%
  data.frame() %>%
  mutate(fullInd = row_number()) %>%
  as_tibble() %>% 
  gather(topic,theta,-fullInd) %>%
  mutate(topic = gsub('X','topic_',topic))

# Metadata rows matched to theta by row position (natural join on the shared
# columns), keeping only the highest-theta topic(s) of each row.
toAnal <- out$meta %>%
  as_tibble() %>%
  mutate(fullInd = row_number()) %>%
  select(-textclean) %>%
  left_join(thetaLong) %>%
  group_by(fullInd) %>%
  filter(theta == max(theta))

require(lfe)

fit$theta

# Regression of `interrupted` on topic indicators plus three SENT_* controls,
# using felm's four-part formula. Per the lfe documentation the parts after
# the first are: factors projected out | instruments (0 = none) | clusters.
modelSpec <- paste0('interrupted ~ ',
                    ' factor(topic) ',
                    ' + SENT_combAttack + SENT_combIncoh + SENT_combToxic',
                    '| opensecretsID + docID | 0 | opensecretsID + docID')
felmFit <- felm(as.formula(modelSpec),toAnal)

# Coefficient table as a tibble. `topic` is the coefficient name with
# everything up to the closing parenthesis removed, so 'factor(topic)topic_k'
# becomes 'topic_k'.
coefTable <- data.frame(summary(felmFit)$coefficients)
coefTable$topic <- gsub('.*?\\)','',rownames(coefTable))
toplot0 <- coefTable %>%
  as_tibble() %>%
  rename(est = Estimate,
         se = Cluster.s.e.,
         tstat = t.value,
         pval = Pr...t..)

  
  

# Rebuild toAnal with every (row, topic) pair kept, not only the highest-theta
# topic of each row.
thetaAllTopics <- fit$theta %>%
  data.frame() %>%
  mutate(fullInd = row_number()) %>%
  as_tibble() %>% 
  gather(topic,theta,-fullInd) %>%
  mutate(topic = gsub('X','topic_',topic))

metaRows <- out$meta %>%
  as_tibble() %>%
  mutate(fullInd = row_number()) %>%
  select(-textclean)

# Natural join on the columns the two tables share.
toAnal <- left_join(metaRows,thetaAllTopics)

  
# Raw difference in mean theta between interrupted and uninterrupted
# utterances, per topic, excluding rows whose opensecretsID contains 'YELLEN'.
#
# Stored under its own name rather than as `toplot0`: the last plot in this
# script reads the `est`, `se` and `pval` columns of `toplot0`, which only the
# felm coefficient table has, so overwriting `toplot0` here breaks that plot.
toplotDiff <- toAnal %>%
  filter(!grepl('YELLEN',opensecretsID)) %>%
  group_by(topic,interrupted) %>%
  summarise(theta = mean(theta)) %>%
  ungroup() %>%
  spread(interrupted,theta,sep = '_') %>%
  mutate(diff = interrupted_1 - interrupted_0)

# NOTE(review): the arguments below (covariate, topics, method, cov.value1/2)
# look like stm's plot method for estimateEffect objects, but stm is not
# attached anywhere in this file -- confirm it is loaded before this runs.
summary(prep)

# Point estimates for the `interrupted` covariate across topics 1-100.
toplot1 <- plot(prep,
                method = 'pointestimate',
                covariate = 'interrupted',
                topics = 1:100)

# Differences between FEDYELLEN and each other chair on the `comparisons`
# covariate. Each call also draws a plot; only the returned values are used.
toplot2 <- plot(prep,
                method = 'difference',
                covariate = 'comparisons',
                cov.value1 = 'FEDYELLEN',cov.value2 = 'FEDBERNANKE',
                topics = 1:100)

toplot3 <- plot(prep,
                method = 'difference',
                covariate = 'comparisons',
                cov.value1 = 'FEDYELLEN',cov.value2 = 'FEDPOWELL',
                topics = 1:100)

toplot4 <- plot(prep,
                method = 'difference',
                covariate = 'comparisons',
                cov.value1 = 'FEDYELLEN',cov.value2 = 'FEDGREENSPAN',
                topics = 1:100)


unlist(toplot1$cis)

# Interruption estimates from toplot1: the means, plus the first and second
# element of each `cis` entry as lower and upper bound. Rows are numbered into
# 'topic_<k>'. This table is the same for every contrast, so it is built once.
intEffects <- as_tibble(data.frame(mInt = unlist(toplot1$means),
                                   lbInt = sapply(toplot1$cis,function(x) x[1]),
                                   ubInt = sapply(toplot1$cis,function(x) x[2])) %>%
                          mutate(topic = paste0('topic_',row_number())))

# Yellen-vs-chair contrasts, keyed by the reference chair's name.
chairContrasts <- list(Bernanke = toplot2,Powell = toplot3,Greenspan = toplot4)

# One stacked table with the interruption estimates joined to each contrast
# (natural join on `topic`); `ref` records which chair Yellen is compared to.
toplotFull <- bind_rows(lapply(names(chairContrasts),function(chair) {
  contrast <- chairContrasts[[chair]]
  chairEffects <- as_tibble(data.frame(m = unlist(contrast$means),
                                       lb = sapply(contrast$cis,function(x) x[1]),
                                       ub = sapply(contrast$cis,function(x) x[2])) %>%
                              mutate(topic = paste0('topic_',row_number()),
                                     ref = chair))
  left_join(intEffects,chairEffects)
}))


# Supplementary figure: Yellen-vs-chair topic difference (x) against the
# interruption estimate (y), one panel per reference chair.

# Mean theta per topic over rows whose opensecretsID does not contain
# 'YELLEN'; mapped to point size.
stmPrevalence <- toAnal %>%
  filter(!grepl('YELLEN',opensecretsID)) %>%
  group_by(topic) %>%
  summarise(theta = mean(theta)) %>%
  ungroup()

stmPlot <- toplotFull %>%
  left_join(stmPrevalence) %>%
  # A topic counts as significant only when both intervals exclude zero.
  mutate(sig = ifelse((lbInt > 0 | ubInt < 0) & (lb > 0 | ub < 0),1,0)) %>%
  ggplot(aes(x = m,y = mInt,size = theta,alpha = factor(sig))) + 
  geom_point() + 
  geom_vline(xintercept = 0,linetype = 'dashed') + 
  geom_hline(yintercept = 0,linetype = 'dashed') + 
  geom_errorbarh(aes(xmin = lb,xmax = ub),size = .5) + 
  geom_errorbar(aes(ymin = lbInt,ymax = ubInt),size = .5) + 
  geom_point(shape = 21,fill = 'white') + 
  scale_alpha_manual(name = 'Significant',values = c(.1,1),labels = c('Insig','Sig')) + 
  scale_size_continuous(name = 'Prevalence') + 
  facet_grid(~ref) + 
  ylab(bquote('' %<-% ' Topic Interrupted Less ... Topic Interrupted More ' %->% '')) + 
  xlab(bquote('' %<-% ' Topic Used More by Fed Chairs ... Topic Used More by Yellen ' %->% '')) + 
  theme_ridges() + 
  theme(axis.title.x = element_text(hjust = .5,vjust = 0),
        axis.title.y = element_text(hjust = .5,vjust = 0),
        legend.position = 'bottom')

pdf('../Paper/Figures/stm_SI.pdf',width = 9,height = 4)
# print() is called explicitly: top-level auto-printing does not happen when
# the script is run through source(), which would leave the PDF empty.
print(stmPlot)
dev.off()


# Mean theta per topic over all rows of toAnal; mapped to point size.
overallPrevalence <- toAnal %>%
  group_by(topic) %>%
  summarise(theta = mean(theta))

# Topic rows of toplot0 (columns est, se, pval are read below) joined to the
# Yellen-vs-chair contrasts. `sig` is 1 when pval < 0.05 and the contrast
# interval excludes zero.
felmVsContrast <- toplot0 %>%
  filter(grepl('topic_',topic)) %>%
  left_join(toplotFull) %>%
  mutate(sig = ifelse(pval < 0.05 &
                        (lb > 0 | ub < 0),1,0)) %>%
  left_join(overallPrevalence)

# Shown on the active device only (no file is written); vertical bars span
# est +/- 2 se.
ggplot(felmVsContrast,aes(x = m,y = est,alpha = factor(sig),size = theta)) + 
  geom_vline(xintercept = 0,linetype = 'dashed') + 
  geom_hline(yintercept = 0,linetype = 'dashed') + 
  geom_errorbarh(aes(xmin = lb,xmax = ub),size = .5) + 
  geom_errorbar(aes(ymin = est - 2*se,ymax = est + 2*se),size = .5) + 
  geom_point(shape = 21,fill = 'white') + 
  scale_alpha_manual(name = 'Significant',values = c(.1,1),labels = c('Insig','Sig')) + 
  scale_size_continuous(name = 'Prevalence') + 
  facet_grid(~ref) + 
  ylab(bquote('' %<-% ' Topic Interrupted Less ... Topic Interrupted More ' %->% '')) + 
  xlab(bquote('' %<-% ' Topic Used More by Fed Chairs ... Topic Used More by Yellen ' %->% '')) + 
  theme_ridges() + 
  theme(axis.title.x = element_text(hjust = .5,vjust = 0),
        axis.title.y = element_text(hjust = .5,vjust = 0),
        legend.position = 'bottom')
